import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import pandas as pd


def main():
    LOGGER.info('Begin b_combining_datasets_for_regressions.py')
    # Cleaning panel structure for patenting
    dffirmyear = pd.read_csv(PATHS.dropbox / 'Data_outputted/C_PatentVariables/Panel_OEMs_level1_with_subsi_FamInfo.csv')
    dffirmyear = dffirmyear.rename(columns={'earliest_filing_year': 'Year'})
    main_col = ['OEM_Level1_ID', 'name', 'Year', 'Count', 'CountClean', 'CountDirty', 'CountGrey', 'Stock', 'StockClean']
    bounds = Find_FirstandLast_Observation_foreachfirm(dffirmyear[main_col])
    df_template = Create_ImplicitPanelTemplate(bounds)
    dffirmyear = Add_ImplicitObservations(dffirmyear, df_template)
    dffirmyear = Create_Bat_FC_var(dffirmyear)
    build_jurisdiction_year_panel(dffirmyear)
    dffirmyear = Add_Policy_Variables(dffirmyear)
    dffirmyear = Add_SupplierNetworkVariables(dffirmyear)
    dffirmyear.to_csv(PATHS.dropbox / 'Data_outputted/E_Analysis/oems_panel_regressions.csv', index=False)
    LOGGER.info('End b_combining_datasets_for_regressions.py')



def build_jurisdiction_year_panel(dffirmyear):
    dffirmcountryyear = pd.read_csv(PATHS.dropbox / 'Data_outputted/D_Policy/countryyear_policies_and_oem_shares.csv')
    dffirmcountryyear = dffirmcountryyear.merge(dffirmyear[['OEM_Level1_ID','Year','diff_share_bat_fc','diff_share_bat_fc_0imputed','CountClean','Count_FC','Count_Bat']],on = ['Year','OEM_Level1_ID'])
    # note that for some years and some countries, some of the OEMs selling in that jurisdiction don't have an observation in the dffirmyear dataset (e.g. USA in 1995, Navistar, Porsche and Chrysler group) --> may be an error
    for col in ['diff_share_bat_fc','diff_share_bat_fc_0imputed','CountClean','Count_FC','Count_Bat']:
        col_newname = col.split('_code')[0] + '_exposure'
        # we use the share of sales in country i as a weight. on the next line we multiply and then after we agg by summing so it's a weighted average
        dffirmcountryyear[col_newname] = dffirmcountryyear['Share_of_OEM_for_CountrySales'] * dffirmcountryyear[col]
    list_cols = ['diff_share_bat_fc_exposure', 'diff_share_bat_fc_0imputed_exposure', 'CountClean_exposure','Count_FC_exposure','Count_Bat_exposure','Share_of_OEM_for_CountrySales']
    dfcountryyear = dffirmcountryyear.groupby(['Country', "Year"])[list_cols].sum()
    dfcountryyear_policies = dffirmcountryyear[['Country', 'Year', 'strat_fc_code','strat_bev_code', 'strat_focus_code', 'fchydrogen','otherstorage']].drop_duplicates()
    # add policy vars back in:
    dfcountryyear = dfcountryyear.merge(dfcountryyear_policies, on=['Country','Year'])
    dfcountryyear.to_csv(PATHS.dropbox / 'Data_outputted/E_Analysis/country_panel_regressions.csv',index = False)


def Find_FirstandLast_Observation_foreachfirm(dffirmyear):
    # FIND FIRST OBS AND LAST OBS FOR EACH FIRM
    bounds = pd.DataFrame(dffirmyear.groupby(['OEM_Level1_ID'])['Year'].min().rename('first_obs'))
    bounds['last_obs'] = dffirmyear.groupby(['OEM_Level1_ID'])['Year'].max()
    bounds = bounds.astype(int)
    bounds['YearsActive'] = bounds['last_obs'] - bounds['first_obs']
    bounds['first_obs'].value_counts().sort_index()
    # the last firm enters in 2011. ie 2011 onwards, all firms have entered.
    bounds['last_obs'].value_counts().sort_index()
    bounds['name'] = dffirmyear[['OEM_Level1_ID', 'name']].drop_duplicates().set_index('OEM_Level1_ID')
    bounds = bounds.sort_values(by='first_obs')
    bounds.to_csv(PATHS.dropbox / 'Data_outputted/E_Analysis/bounds_timeseries_oems.csv')
    return bounds


def Create_ImplicitPanelTemplate(bounds):
    # i.e. create a dataset where for each firm, we document all the years for which we implicitly observe the firm
    # i.e. if for firm i, first year is 2005 and last 2010, but the firm only has patents in 2005, 2007 and 2010
    # we conclude that we implicitly observe the firm non-stop from 2005 to 2010.
    # later we will add zeros for the years 2006, 2008 and 2009
    ind = [(i, y) for i in bounds.index for y in list(range(bounds.loc[i, 'first_obs'], bounds.loc[i, 'last_obs'] + 1))]
    ind = pd.MultiIndex.from_tuples(ind, names=['OEM_Level1_ID', 'Year'])
    df_template = pd.DataFrame(index=ind).reset_index()
    # df_template is a dataframe that has only 2 columns: 'yet_id', 'appln_filing_year'
    df_template = df_template.merge(bounds['first_obs'], on='OEM_Level1_ID', how='left')
    df_template['YearsActive'] = df_template['Year'] + 1 - df_template['first_obs']
    df_template = df_template.drop(columns='first_obs')
    # Template from 1990 to 2016. exclude 2017 cause too much attrition
    df_template = df_template[df_template['Year'].isin(range(1990, 2017))]
    return df_template


def Add_ImplicitObservations(dffirmyear, df_template):
    # MERGE PATENTING DATA ON PANEL TEMPLATE
    dffirmyear = df_template.merge(dffirmyear, on=['OEM_Level1_ID', 'Year'], how='left', indicator=True)
    del dffirmyear['_merge']
    # For COUNT variables, FILL MISSING VALUES WITH ZEROS for all years where we know the firm was around but did not patent
    # basically, if there is a row in the data, it means we know the firm had been patenting before or starting in that year
    # hence we can input a zero
    # I'm not including things like 'MeanFamSize' cause this should still be NaN in years where there s no patent
    list_var_to_input_zeros = [c for c in dffirmyear.columns if 'Count' in c]
    dffirmyear[list_var_to_input_zeros] = dffirmyear[list_var_to_input_zeros].fillna(0)
    # FOR CUMULATIVE VARIABLES, FILL IN WITH VALUE FROM PREVIOUS YEAR
    list_var_cumulative = [c for c in dffirmyear.columns if 'Stock' in c]
    dffirmyear = dffirmyear.sort_values(['OEM_Level1_ID', 'Year'])
    dffirmyear[list_var_cumulative] = dffirmyear.groupby('OEM_Level1_ID')[list_var_cumulative].fillna(method="ffill")
    return dffirmyear


def Create_Bat_FC_var(dffirmyear):
    OEMs = pd.read_csv(PATHS.dropbox / 'Data_outputted/A_AutoIndustry/OEMs_shortenednamesforgraphs.csv')
    dffirmyear = dffirmyear.merge(OEMs[['OEM_Level1_ID', 'shortname']].drop_duplicates(), on='OEM_Level1_ID')
    dffirmyear['ratio_bat_fc'] = (dffirmyear['Count_Bat'] + 1) / (dffirmyear['Count_FC'] + 1)
    dffirmyear['ratio_bat_fc_excl'] = (dffirmyear['Count_Bat_excl'] + 1) / (dffirmyear['Count_FC_excl'] + 1)
    dffirmyear['share_bat'] = dffirmyear['Count_Bat'] / dffirmyear['Count_CleanCar']
    dffirmyear['share_bat_excl'] = dffirmyear['Count_Bat_excl'] / dffirmyear['Count_CleanCar_excl']
    dffirmyear['share_fc'] = dffirmyear['Count_FC'] / dffirmyear['Count_CleanCar']
    dffirmyear['share_fc_excl'] = dffirmyear['Count_FC_excl'] / dffirmyear['Count_CleanCar_excl']
    dffirmyear['diff_share_bat_fc'] = dffirmyear['share_bat'] - dffirmyear['share_fc']
    dffirmyear['diff_share_bat_fc_0imputed'] = dffirmyear['diff_share_bat_fc'].fillna(0)
    dffirmyear['diff_share_bat_fc_excl'] = dffirmyear['share_bat_excl'] - dffirmyear['share_fc_excl']
    dffirmyear['share_greydirty'] = (dffirmyear['CountDirty'] + dffirmyear['CountGrey']) / dffirmyear['Count']
    dffirmyear['share_greydirty_excl'] = (dffirmyear['CountDirty_excl'] + dffirmyear['CountGrey_excl']) / dffirmyear['Count']
    dffirmyear['share_clean'] = dffirmyear['CountClean'] / dffirmyear['Count']
    dffirmyear['share_clean_excl'] = dffirmyear['CountClean_excl'] / dffirmyear['Count']
    dffirmyear['share_clean_in_energy'] = dffirmyear['CountClean'] / (dffirmyear['CountDirty'] + dffirmyear['CountGrey'] + dffirmyear['CountClean'])
    dffirmyear['share_clean_in_energy_excl'] = dffirmyear['CountClean_excl'] / (dffirmyear['CountDirty_excl'] + dffirmyear['CountGrey_excl'] + dffirmyear['CountClean_excl'])
    dffirmyear['ratio_cleancar_v_ice'] = (dffirmyear['Count_CleanCar'] + 1) / (dffirmyear['Count_EffICE'] + dffirmyear['Count_ICE'] + 1)
    dffirmyear['ratio_cleancar_v_ice_excl'] = (dffirmyear['Count_CleanCar_excl'] + 1) / (dffirmyear['Count_EffICE_excl'] + dffirmyear['Count_ICE_excl'] + 1)
    dffirmyear['share_cleancar'] = dffirmyear['Count_CleanCar'] / dffirmyear['Count']
    dffirmyear['share_cleancar_excl'] = dffirmyear['Count_CleanCar_excl'] / dffirmyear['Count']
    dffirmyear['share_cleancar_in_energy'] = dffirmyear['Count_CleanCar'] /(dffirmyear['Count_EffICE'] + dffirmyear['Count_ICE'] + dffirmyear['Count_CleanCar'] )
    dffirmyear['share_cleancar_in_energy_excl'] = dffirmyear['Count_CleanCar_excl'] /(dffirmyear['Count_EffICE_excl'] + dffirmyear['Count_ICE_excl'] + dffirmyear['Count_CleanCar_excl'] )
    dffirmyear['share_ice'] = (dffirmyear['Count_ICE'] + dffirmyear['Count_EffICE']) / dffirmyear['Count']
    dffirmyear['share_ice_excl'] = (dffirmyear['Count_ICE_excl'] + dffirmyear['Count_EffICE_excl']) / dffirmyear['Count']
    dffirmyear = dffirmyear[dffirmyear['OEM_Level1_ID'] != 31]
    return dffirmyear


def Add_Policy_Variables(dffirmyear):
    # Add policy variables
    df_policy = pd.read_csv(PATHS.dropbox / 'Data_outputted/D_Policy/firm_year_policy_exposure.csv')
    dffirmyear = dffirmyear.merge(df_policy, on=['Year', 'OEM_Level1_ID'], how='left')
    return dffirmyear


def Add_SupplierNetworkVariables(dffirmyear):
    SC_ProdPanel = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/SC_ProdPanel.csv')
    cols = ['shortname', 'Level1_MLName', 'name', 'OEM_Level1_ID']
    dffirmyear = dffirmyear[cols + [k for k in dffirmyear.columns if k not in cols]]
    SC_ProdPanel = SC_ProdPanel.drop(columns='Level1_MLName')
    dffirmyear = dffirmyear.merge(SC_ProdPanel, on=['Year', 'OEM_Level1_ID'], how='left')
    return dffirmyear

